################################################################################ 
#
# The distribution of hate speech and its implications for content moderation
# PSRM - Replication package
# Table G17
#
################################################################################# 

library(readr)
library(dplyr)
library(tidyr)

# Clear the workspace, keeping only objects whose names start with "wd"
# (e.g. the `wd_data` / `wd_res` paths used below) and `setsave`. These are
# expected to be defined before this script is run.
rm(list = setdiff(ls(), ls(pattern = "^wd|^setsave$")))

set.seed(123456)

# Threshold settings. They are not referenced again in the visible part of
# this script.
accepted_threshold <- 0.80
thresholds <- c(0.80)
thresholdnames <- c("80")

# Load the two experiment datasets (main and "llm" variant, per file name).
df_all <- readRDS(paste0(wd_data, "/twitter_experiment/tweets_experiment_appendix_anon.rds"))
df_all_llm <- readRDS(paste0(wd_data, "/twitter_experiment/tweets_experiment_llm_appendix_anon.rds"))

# Print the distribution of `cs_ausgefuhrt` in EACH dataset before filtering.
# (The second tabulation previously repeated `df_all` instead of `df_all_llm`.)
df_all %>% group_by(cs_ausgefuhrt) %>% summarise(n = n())
df_all_llm %>% group_by(cs_ausgefuhrt) %>% summarise(n = n())

# Keep only rows with cs_ausgefuhrt == 1 in both datasets.
df_all <- df_all %>% filter(cs_ausgefuhrt == 1)
df_all_llm <- df_all_llm %>% filter(cs_ausgefuhrt == 1)

# Print the available column names for inspection.
names(df_all)
names(df_all_llm)

# Target group columns (German variable names) that are pivoted into a single
# `target_group` column below.
target_columns <- c("geschlecht", "alter", "sexualitat", "religion",
                    "nationalitat", "beeintrachtigung", "sozialer_status",
                    "politik", "aussehen", "andere")

# Map a vector of target group column names to English display labels.
# Matching is by case-insensitive substring, first match wins; anything that
# matches no pattern is labelled "ELSE".
label_target_group <- function(x) {
  case_when(
    grepl("sozialer_status", x, ignore.case = TRUE) ~ "Social Status",
    grepl("sexualitat", x, ignore.case = TRUE) ~ "Sexuality",
    grepl("religion", x, ignore.case = TRUE) ~ "Religion",
    grepl("politik", x, ignore.case = TRUE) ~ "Politics",
    grepl("nationalitat", x, ignore.case = TRUE) ~ "Nationality",
    grepl("geschlecht", x, ignore.case = TRUE) ~ "Sex",
    grepl("beeintrachtigung", x, ignore.case = TRUE) ~ "Disability",
    grepl("aussehen", x, ignore.case = TRUE) ~ "Appearance",
    grepl("andere", x, ignore.case = TRUE) ~ "Others",
    grepl("alter", x, ignore.case = TRUE) ~ "Age",
    TRUE ~ "ELSE"
  )
}

# Reshape a tweet-level data frame to one row per (tweet, target group).
#   df: data frame containing every column named in `target_columns`.
# Returns `df` in long format: the target columns are replaced by a labelled
# `target_group` column (only non-missing entries are kept) and a constant
# `overall` column equal to "All Groups" is added.
reshape_by_target_group <- function(df) {
  df %>%
    pivot_longer(cols = all_of(target_columns),
                 names_to = "target_group",
                 values_to = "value") %>%
    filter(!is.na(value)) %>%  # Keep only rows where the value is not NA
    select(-value) %>%         # The value itself is not needed afterwards
    mutate(overall = "All Groups",
           target_group = label_target_group(target_group))
}

# Apply the identical transformation to both datasets.
df_long <- reshape_by_target_group(df_all)
df_long_llm <- reshape_by_target_group(df_all_llm)

##################################################################################################
# 4) Make Table 
##################################################################################################
# Stack the long data on top of a copy relabelled "All Groups", so the pooled
# sample becomes one more level of `target_group`. "All Groups" is placed
# first in the factor levels; the remaining levels follow their order of
# first appearance in df_long.
tmp <- bind_rows(
  df_long %>% mutate(target_group = as.character(target_group)),
  df_long %>% mutate(target_group = "All Groups")
) %>%
  mutate(target_group = factor(
    target_group,
    levels = c("All Groups", unique(df_long$target_group))
  ))

# Derived account covariates: log10(count + 1) transforms and a binary
# indicator that is 1 when anonymity < -1 and 0 otherwise.
tmp <- tmp %>%
  mutate(
    log_followers = log10(followers_count + 1),
    log_following = log10(friends_count + 1),
    log_listed = log10(listed_count + 1),
    log_status_count = log10(statuses_count + 1),
    anonymity_bool = ifelse(anonymity >= -1, 0, 1)
  )

# Variables summarised in the table (note: `listed_count` enters unlogged).
all_vars <- c(
  "verified", "anonymity_bool", "country_given",
  "log_followers", "log_following", "listed_count", "log_status_count",
  "account_age_days", "anonymity", "entropy", "name_matching"
)

# Levels of the grouping factor, "All Groups" included.
target_groups <- levels(tmp$target_group)

# Mean and SD of every variable within each target group (NAs ignored).
# Output columns are named <variable>_mean and <variable>_sd.
summary_table <- tmp %>%
  group_by(target_group) %>%
  summarise(
    across(
      all_of(all_vars),
      list(
        mean = function(x) mean(x, na.rm = TRUE),
        sd = function(x) sd(x, na.rm = TRUE)
      ),
      .names = "{.col}_{.fn}"
    )
  ) %>%
  ungroup()

# Reshape to one row per (target group, variable) with `mean` and `sd` as
# columns; the ".value" sentinel splits the suffix straight into those
# columns. Then apply the display names used in the final table.
summary_long <- summary_table %>%
  pivot_longer(
    cols = -target_group,
    names_to = c("var_raw", ".value"),
    names_pattern = "^(.*)_(mean|sd)$"
  ) %>%
  rename(
    `Target Group` = target_group,
    Mean = mean,
    SD = sd
  )

##################################################################################################
# 5) DIFFERENCES AND SIGNIFICANCE 
##################################################################################################

# (a) Reference means: one row per variable, taken from the "All Groups" rows.
all_groups_means <- summary_long %>%
  filter(`Target Group` == "All Groups") %>%
  select(var_raw, all_mean = Mean)

# (b) Attach the reference mean to every row and compute each group's
#     deviation from it (zero for the "All Groups" rows themselves).
summary_long <- left_join(summary_long, all_groups_means, by = "var_raw") %>%
  mutate(Difference = Mean - all_mean)

# (c) Helper: p-value of a two-sample t-test comparing one target group with
#     the pooled "All Groups" rows of the global data frame `tmp`.
#
#     var_name:   name (string) of the column in `tmp` to test.
#     group_name: target group to compare; a string or a one-element factor
#                 (rowwise() over a factor column passes the latter).
#
#     Returns the p-value from t.test() called with its defaults, or NA_real_
#     when group_name is "All Groups" (no self-comparison) or when either
#     sample has fewer than two non-missing observations, in which case
#     t.test() would stop with an error and abort the whole table.
#     Stops with an error if `var_name` is not a column of `tmp`.
get_p_value <- function(var_name, group_name) {
  # Normalise to character so the comparisons below never depend on the
  # level set of a factor argument.
  group_name <- as.character(group_name)

  if (group_name == "All Groups") {
    return(NA_real_)  # Do not test "All Groups" against itself
  }

  stopifnot(var_name %in% names(tmp))

  values <- tmp[[var_name]]
  groups <- as.character(tmp$target_group)

  # which() drops rows whose group is NA instead of propagating them.
  group_data <- values[which(groups == group_name)]
  all_data <- values[which(groups == "All Groups")]

  # Remove missing values before testing.
  group_data <- group_data[!is.na(group_data)]
  all_data <- all_data[!is.na(all_data)]

  # t.test() needs at least two observations per sample.
  if (length(group_data) < 2 || length(all_data) < 2) {
    return(NA_real_)
  }

  t.test(group_data, all_data)$p.value
}

# (d) p-values and significance flag at the 0.05 level.
#     get_p_value() takes one variable name and one group at a time, so it is
#     called row by row; the flag is then derived in a single vectorised step.
#     Rows without a p-value (the "All Groups" reference) get NA as flag.
summary_long <- summary_long %>%
  rowwise() %>%
  mutate(p_value = get_p_value(var_raw, `Target Group`)) %>%
  ungroup() %>%
  mutate(
    Significant = ifelse(
      is.na(p_value),
      NA,
      ifelse(p_value < 0.05, "Yes", "No")
    )
  )

# (e) Reader-friendly labels, keyed by raw variable name. The order of this
#     lookup is also the row order of the variables within each target group.
variable_labels <- c(
  verified = "Proportion of Verified Accounts",
  name_matching = "Proportion of Verified Names",
  entropy = "User Name Entropy Score",
  anonymity = "Personal Identification Score",
  country_given = "Profile contains a Country",
  log_followers = "log(Followers Count)",
  log_following = "log(Friends Count)",
  log_status_count = "log(Status Count)",
  listed_count = "No. of lists a user is member of",
  account_age_days = "Account Age (days)"
)
desired_order <- unname(variable_labels)

# Drop the helper indicator, label and order the variables, keep only the
# reported columns and round all numeric results to three decimals.
summary_long <- summary_long %>%
  filter(var_raw != "anonymity_bool") %>%
  mutate(
    Variable = factor(recode(var_raw, !!!variable_labels),
                      levels = desired_order)
  ) %>%
  arrange(`Target Group`, Variable) %>%
  select(`Target Group`, Variable, Mean, SD, Difference, p_value, Significant) %>%
  mutate(
    across(c(Mean, SD, Difference), function(x) round(x, 3)),
    p_value = ifelse(is.na(p_value), NA, round(p_value, 3))
  )

# Print the finished table to the console.
summary_long

# Write the table to disk in CSV format.
# NOTE(review): the output path has no ".csv" extension -- confirm this is
# what downstream steps of the replication package expect.
write.csv(summary_long, paste0(wd_res, "/tables/table_g17"), row.names = FALSE)

cat(
  "\n====================\n",
  "Saved Table G17",
  "\n====================\n",
  sep = ""
)